knitr::opts_chunk$set(echo = TRUE)
options(stringsAsFactors = FALSE)
library(tidyverse)
theme_set(theme_classic())
theme_update(plot.title = element_text(face="bold"))
library(cowplot)
library(DT)
library(class)
library(MASS)
library(randomForest)
library(DESeq2)
compute <- FALSE
data <- read.delim2("Species_Habit_Orthogroups.GeneCount.June23.tsv", sep = "\t") %>%
filter(!(Species %in% c("Drimys", "Macint", "Vitvin", "Aratha")))
cat("Raw data: Species x OrthoGroups: ", paste0(dim(data), collapse = " x "))
## Raw data: Species x OrthoGroups: 32 x 36519
idx <- colSums(data > 0) >= 5
data <- data[,idx]
cat("Ortholog in at least five species: Species x OrthoGroups: ", paste0(dim(data), collapse = " x "))
## Ortholog in at least five species: Species x OrthoGroups: 32 x 13779
# Gene information - orthogroups
orthogroups <- read.delim2("Orthogroups.tsv", sep = "\t", header = TRUE) %>%
dplyr::select(Orthogroup, Poptre.SHORT.pep) %>%
dplyr::rename(OrthoGroup = Orthogroup, Pt = Poptre.SHORT.pep) %>%
filter(Pt != "") %>%
arrange(OrthoGroup) %>%
separate_rows(Pt, sep = ", ") %>%
separate(Pt, into = c("Pt"), sep = "\\.", extra = "drop") %>%
mutate(Pt = gsub("Poptre_", "", Pt))
gene.info <- read.delim2("gene_info.txt") %>%
dplyr::rename(Pt = gene_id, Descr = description) %>%
dplyr::select(Pt, Descr)
orthogroups <- left_join(orthogroups, gene.info, "Pt") %>%
mutate(Descr = gsub("\\S+\\|\\S+\\|\\S+\\s", "", Descr)) %>%
mutate(Descr = gsub(" OS=.+$", "", Descr))
Use a statistical test to identify ortholog groups with differences in copy numbers between woody and herbaceous growth form.
DOGs: Differential Ortholog Groups
dds <- DESeqDataSetFromMatrix(countData = t(data[, 3:ncol(data)]),
colData = DataFrame(condition = as.factor(data$Habit)),
formula(~ condition))
dds <- DESeq(dds)
degres <- results(dds) %>%
as.data.frame %>%
rownames_to_column(var = "OrthoGroups") %>%
drop_na(pvalue) %>%
mutate(padj = p.adjust(pvalue, method = "BH")) %>%
arrange(padj)
nDOGs <- nrow(degres %>% filter(padj <= 0.1))
cat("DOGs: ", nDOGs, "\n")
## DOGs: 26
degres <- degres %>%
dplyr::select(-baseMean, -lfcSE, -pvalue) %>%
filter(padj <= 0.1) %>%
mutate(log2FoldChange = round(log2FoldChange, digits = 3)) %>%
mutate(stat = round(stat, digits = 3)) %>%
mutate(padj = format(padj, digits = 3, scientific = TRUE))
datatable(degres, rownames = FALSE, filter = "top",
options = list(
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
if (FALSE) {
plots <- list()
for (i in 1:nDOGs) {
orth <- sym(degres$OrthoGroups[i])
p <- degres$padj[i]
plots[[i]] <- ggplot(data, aes(x = Habit, y = !!orth, col = Habit)) +
theme_bw() + theme() +
geom_boxplot(outlier.shape = NA) +
geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0) +
geom_text(x=1.5, y=max(data[[degres$OrthoGroups[i]]]), col = "black",
label=paste("p = ", p))
}
plot_grid(plotlist = plots, ncol = 3)
}
Use machine learning to identify ortholog groups with differences in copy numbers between woody and herbaceous growth form.
SOGs: Significant Ortholog Groups
set.seed(3)
# Random forest
if (compute) {
fit <- randomForest(Habit ~ ., data=cbind(data[, 3:ncol(data)],
data.frame(Habit = as.factor(data$Habit))),
importance=TRUE, proximity=TRUE,
ntree=1000)
confussion.matrix <- table(data$Habit, fit$predicted)
accuracy <- sum(data$Habit == fit$predicted)/nrow(data)
feature.importance <- as.data.frame(importance(fit))
# Randomize
idx <- sample(1:nrow(data), nrow(data), replace = FALSE)
fit.rand <- randomForest(Habit ~ ., data=cbind(data[, 3:ncol(data)],
data.frame(Habit = as.factor(data$Habit[idx]))),
importance=TRUE, proximity=TRUE,
ntree=1000)
table(data$Habit[idx], fit.rand$predicted)
sum(data$Habit[idx] == fit.rand$predicted)/nrow(data)
rand.scores <- importance(fit.rand)[,"MeanDecreaseGini"]
# Save
save(confussion.matrix, accuracy, feature.importance, rand.scores, file = "RData/rf_randomize.RData")
} else {
load(file = "RData/rf_randomize-without-outgroups.RData")
}
confussion.matrix
##
## HA WP
## HA 4 7
## WP 1 20
accuracy
## [1] 0.75
feature.importance <- feature.importance %>%
rownames_to_column(var = "OrthoGroups") %>%
arrange(desc(MeanDecreaseGini)) %>%
filter(MeanDecreaseGini > 0) %>%
rowwise() %>%
mutate(pvalue = (1+sum(rand.scores >= MeanDecreaseGini))/(length(rand.scores)+1)) %>%
ungroup() %>%
mutate(padj = p.adjust(pvalue, method = "BH"))
nSOGs <- nrow(feature.importance %>% filter(padj <= 0.1))
cat("SOGs: ", nSOGs, "\n")
## SOGs: 101
feature.importance <- feature.importance %>%
dplyr::select(-MeanDecreaseAccuracy, -pvalue) %>%
filter(padj <= 0.1) %>%
mutate(HA = round(HA, digits = 3)) %>%
mutate(WP = round(WP, digits = 3)) %>%
mutate(MeanDecreaseGini = round(MeanDecreaseGini, digits = 3)) %>%
mutate(padj = format(padj, digits = 3, scientific = TRUE))
datatable(feature.importance,
rownames = FALSE, filter = "top",
options = list(
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
if (FALSE) {
plots <- list()
for (i in 1:nSOGs) {
orth <- sym(feature.importance$OrthoGroups[i])
plots[[i]] <- ggplot(data, aes(x = Habit, y = !!orth, col = Habit)) +
theme_bw() + theme() +
geom_boxplot(outlier.shape = NA) +
geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0)
}
plot_grid(plotlist = plots, ncol = 3)
}
Ortholog groups identified by one of the two methods:
Box plots show copy numbers in the two growth forms
Line plots show AspWood expression of the most highly expressed aspen gene in the ortholog group.
OBS: Expression plots for all aspen genes can be found here.
candidate_orthogroups <- intersect(degres$OrthoGroups, feature.importance$OrthoGroups)
cat("Intersection DOGs and SOGs:", length(candidate_orthogroups))
## Intersection DOGs and SOGs: 16
candidate_orthogroups <- rbind(degres %>% dplyr::select(OrthoGroups, padj),
feature.importance %>% dplyr::select(OrthoGroups, padj)) %>%
arrange(as.numeric(padj)) %>%
distinct(OrthoGroups) %>%
pull(OrthoGroups)
aspwood_expr <- read.delim("../../DATA/AspWood_transcriptomics_Ptremula.txt",
header = TRUE, sep = "\t") %>%
dplyr::rename(Pt = Genes) %>%
gather (Samples, Expression, -1) %>%
filter (Pt %in% orthogroups$Pt) %>%
separate(Samples, into = c("Trees", "Samples"), sep = "\\.") %>%
mutate_at("Samples", as.numeric) %>%
mutate(Expression = log2(Expression+1))
#plot(density(aspwood_expr$Expression), xlab = "log2(TPM)")
aspwood_expr %>%
group_by(Pt) %>%
summarise(Max = max(Expression)) -> max_expr
d <- tibble(Trees = paste0("T", c(rep(1,3),rep(2,3),rep(3,3),rep(4,3))),
xintercept = c(5.5, 12.5, 19.5, 5.5, 11.5, 19.5, 5.5, 14.5, 21.5, 5.5, 12.5, 20.5))
plots <- list()
for (orth in candidate_orthogroups) {
candidates_pt <- orthogroups %>%
filter(OrthoGroup == orth) %>%
left_join(max_expr, by = c("Pt")) %>%
arrange(desc(Max)) %>%
dplyr::slice(1)
if (nrow(candidates_pt) > 0) { # No Pt gene in orthogroup?
orth_sym <- sym(orth)
plots[[length(plots)+1]] <- ggplot(data, aes(x = Habit, y = !!orth_sym, col = Habit)) +
theme_bw() + theme() +
geom_boxplot(outlier.shape = NA) +
geom_jitter(color="black", size=1, alpha=0.9, shape=16, width = 0.1, height = 0)
descr <- candidates_pt$Descr
candidates_pt <- candidates_pt$Pt
plots[[length(plots)+1]] <- aspwood_expr %>% filter(Pt == candidates_pt) %>%
ggplot(aes(x = Samples, y = Expression, col = Trees)) +
geom_line() +
geom_vline(data = d, mapping = aes(xintercept = xintercept), linetype="dashed",
color = "gray68", linewidth=0.5) +
facet_grid(cols = vars(Trees)) +
theme(legend.position = "none") +
ggtitle(label = paste(orth, " - ", candidates_pt, sep = ""),
subtitle = descr)
}
}
plot_grid(plotlist = plots, ncol = 2)